I. Introduction

# Read the comma-separated salary file (its first line holds the column
# names) into `d`, then show the first rows as a sanity check.
d <- read.table(
  file = "salary.txt",
  header = TRUE,
  sep = ","
)

head(d)
##     wage edu exp city       reg  race deg  com emp
## 1 354.94   7  45  yes northeast white  no 24.3 200
## 2 370.37   9   9  yes northeast white  no 26.2 130
## 3 754.94  11  46  yes northeast white  no 26.4 153
## 4 593.54  12  36  yes northeast other  no  9.9  86
## 5 377.23  16  22  yes northeast white yes  7.1 181
## 6 284.90   8  51  yes northeast white  no 11.4  32
# Read the salary file again as `df`, the data frame used for the
# exploratory plots and the train/test split.
df <- read.csv("salary.txt", header = TRUE)

# exploratory data analysis

# Region as plain text, and the distinct regions in order of first appearance
# (one plot is drawn per region).
df$reg2 <- as.character(df$reg)
regions <- unique(df$reg2)

# Plotting symbol per observation, used as `pch`: 1 for a college degree,
# 4 for no degree. Rows with any other value of `deg` stay NA.
df$deg01 <- NA_real_
df$deg01[df$deg == "yes"] <- 1
df$deg01[df$deg == "no"] <- 4

#par(mfrow =c(2,2))

# Logical masks over the rows of `df`, one per race category.
black <- df$race == "black"
white <- df$race == "white"
other <- df$race == "other"

# Colour per observation, by race. Rows matching none of the masks stay NA.
df$color <- NA_character_
df$color[black] <- "red"
df$color[white] <- "blue"
df$color[other] <- "green"


# One scatter plot per region: log(wage) against experience, with the
# plotting symbol showing college degree and one smoother per race.
race_cols <- c(black = "red", white = "blue", other = "green")

for (r in regions) {
  df.sub <- df[df$reg2 == r, ]
  plot(df.sub$exp,
       log(df.sub$wage),
       #col= df$color,
       pch = df.sub$deg01,
       main = r)
  #lines(supsmu(df$edu))

  # Fit each smoother on the rows of the region being plotted. Fitting on
  # the full data would draw the same three curves in every panel.
  for (race_name in names(race_cols)) {
    in_race <- which(df.sub$race == race_name)
    # Skip a race with fewer than two rows in this region: there is no
    # curve to draw.
    if (length(in_race) > 1) {
      lines(supsmu(df.sub$exp[in_race], log(df.sub$wage[in_race])),
            col = race_cols[[race_name]])
    }
  }

  # The legend reuses the colours of the lines, so both always agree
  # regardless of the active palette.
  legend("topleft", legend = c("Black", "White", "Other"),
         col = unname(race_cols), lty = c(1, 1, 1))
  legend("topright", legend = c("College degree", "No college degree"),
         pch = c(1, 4))
}

#http://stackoverflow.com/questions/17551193/r-color-scatter-plot-points-based-on-values


# Pick training and test data

# Draw the test rows at random, without replacement. The seed is fixed at 0
# so the split is reproducible.
set.seed(0)
n_test <- 4965
index <- sample(seq_len(nrow(df)), n_test, replace = FALSE)

# Rows that were not drawn form the training set. `data` is a second name
# for the training set, used by the exploratory plots further down.
train.data <- df[-index, ]
data <- train.data
test.data <- df[index, ]


# Quality control check: compare the share of rows with race "black" in the
# training set and in the test set.
black_share <- function(rows) {
  sum(rows$race == "black") / nrow(rows)
}

black_share(train.data)
## [1] 0.07830597
black_share(test.data)
## [1] 0.07633434
# Rough model 1: wage on its original scale, regressed on edu, exp, city,
# reg, race, deg and com, fitted on the training rows.
rough_fit_1 <- lm(
  wage ~ edu + exp + city + reg + race + deg + com,
  data = train.data
)
summary(rough_fit_1)
## 
## Call:
## lm(formula = wage ~ edu + exp + city + reg + race + deg + com, 
##     data = train.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1106.0  -212.3   -51.0   141.5 18238.3 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -529.01624   21.76914 -24.301  < 2e-16 ***
## edu            58.25716    1.26061  46.214  < 2e-16 ***
## exp            10.77924    0.24169  44.599  < 2e-16 ***
## cityyes       103.89914    6.68588  15.540  < 2e-16 ***
## regnortheast   17.34384    8.45920   2.050  0.04035 *  
## regsouth      -29.67152    7.85718  -3.776  0.00016 ***
## regwest        13.20526    8.54327   1.546  0.12219    
## raceother     131.61036   12.47726  10.548  < 2e-16 ***
## racewhite     132.91059   11.02090  12.060  < 2e-16 ***
## degyes         57.63865    9.48354   6.078 1.24e-09 ***
## com             0.01068    0.35925   0.030  0.97628    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 405.1 on 19847 degrees of freedom
## Multiple R-squared:  0.2125, Adjusted R-squared:  0.2121 
## F-statistic: 535.6 on 10 and 19847 DF,  p-value: < 2.2e-16
# Rough model 2: same predictors as rough model 1, with log(wage) as the
# response, fitted on the training rows.
rough_fit_2 <- lm(
  log(wage) ~ edu + exp + city + reg + race + deg + com,
  data = train.data
)
summary(rough_fit_2)
## 
## Call:
## lm(formula = log(wage) ~ edu + exp + city + reg + race + deg + 
##     com, data = train.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7094 -0.3067  0.0381  0.3491  3.7425 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.3389570  0.0288166 150.572  < 2e-16 ***
## edu           0.0957897  0.0016687  57.403  < 2e-16 ***
## exp           0.0184726  0.0003199  57.738  < 2e-16 ***
## cityyes       0.1633313  0.0088503  18.455  < 2e-16 ***
## regnortheast  0.0330409  0.0111977   2.951  0.00317 ** 
## regsouth     -0.0662718  0.0104008  -6.372 1.91e-10 ***
## regwest      -0.0009764  0.0113090  -0.086  0.93120    
## raceother     0.2340043  0.0165166  14.168  < 2e-16 ***
## racewhite     0.2397341  0.0145887  16.433  < 2e-16 ***
## degyes        0.0348181  0.0125537   2.774  0.00555 ** 
## com          -0.0001071  0.0004755  -0.225  0.82177    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5363 on 19847 degrees of freedom
## Multiple R-squared:  0.2874, Adjusted R-squared:  0.287 
## F-statistic: 800.5 on 10 and 19847 DF,  p-value: < 2.2e-16
# EDA (use smoothers)
# Each panel plots log(wage) against one numeric variable of the training
# data and overlays a supsmu() smoother drawn in colour 2.

##### com
plot(data$com, log(data$wage))
lines(supsmu(data$com, log(data$wage)), col = 2)

##### edu
plot(data$edu, log(data$wage))
lines(supsmu(data$edu, log(data$wage)), col = 2)

##### edu (boxplots)
# By default the boxes are drawn at positions 1, 2, ..., one per distinct
# edu value, while the smoother is drawn at the edu values themselves.
# Placing each box at its own edu value puts both on the same x scale.
edu_values <- sort(unique(data$edu))
boxplot(log(data$wage) ~ data$edu, at = edu_values)
lines(supsmu(data$edu, log(data$wage)), col = 2)

##### exp
plot(data$exp, log(data$wage))
lines(supsmu(data$exp, log(data$wage)), col = 2)

##### emp
plot(data$emp, log(data$wage))
lines(supsmu(data$emp, log(data$wage)), col = 2)

#### Interaction plots
# Work on copies of the training columns. interaction.plot() builds its
# axis and trace labels from the argument expressions, so short names keep
# the labels readable.

# City vs. region (yes)
wage <- data$wage
city <- data$city
reg <- data$reg
interaction.plot(x.factor = city, trace.factor = reg, response = log(wage))

# race vs. region (maybe not)
race <- data$race
interaction.plot(x.factor = race, trace.factor = reg, response = log(wage))

# race vs. city (maybe not)
interaction.plot(x.factor = race, trace.factor = city, response = log(wage))

# edu vs. degree: log(wage) against education, with one least-squares line
# for workers with a college degree (red) and one for those without (black).
# An earlier note here read "the lines do cross"; re-check it against the
# corrected plot.
degree <- data$deg == "yes"

# Colour the points from the mask so they match the two lines. This does
# not depend on `deg` being stored as a factor.
plot(data$edu, log(wage), col = ifelse(degree, "red", "black"))
abline(lm(log(wage)[degree] ~ data$edu[degree]), col = "red")

# `!degree` selects the non-degree rows. `-degree` would turn the mask into
# 0 / -1 positions and drop at most the first observation.
abline(lm(log(wage)[!degree] ~ data$edu[!degree]), col = "black")

# exp vs. race (the lines do cross)
# One colour per race, shared by the points, the smoothers and the legend.
race_line_cols <- c(black = 2, white = 3, other = 4)

# Points coloured by race. Looking the colour up by name works whether
# `race` is stored as text or as a factor; passing `race` itself as `col`
# only works for a factor.
plot(data$exp, log(wage),
     col = unname(race_line_cols[as.character(data$race)]))

# Same scatter without colours, as the background for the smoothers.
plot(data$exp, log(wage))

# Logical masks over the rows of the training data.
black <- data$race == "black"
white <- data$race == "white"
other <- data$race == "other"

lines(supsmu(data$exp[black], log(data$wage)[black]),
      col = race_line_cols[["black"]])
lines(supsmu(data$exp[white], log(data$wage)[white]),
      col = race_line_cols[["white"]])
lines(supsmu(data$exp[other], log(data$wage)[other]),
      col = race_line_cols[["other"]])
legend("topright", legend = c("Black", "White", "Other"),
       col = unname(race_line_cols), lty = c(1, 1, 1))

# Defining indicators (dummy variables): one TRUE/FALSE vector per race over
# the training rows, wrapped in I().
# NOTE(review): `black` and `white` are reassigned to data-frame subsets
# further down, so these indicators are short-lived.
other <- I(data$race == "other")
white <- I(data$race == "white")
black <- I(data$race == "black")

# Copy of the full data set `d` (not the training split) with descriptive
# column names.
dfram <- with(
  d,
  data.frame(
    Wage = wage,
    Edu = edu,
    Exp = exp,
    City = city,
    Region = reg,
    Race = race,
    College = deg,
    Commute = com,
    Employees = emp
  )
)

head(dfram)
##     Wage Edu Exp City    Region  Race College Commute Employees
## 1 354.94   7  45  yes northeast white      no    24.3       200
## 2 370.37   9   9  yes northeast white      no    26.2       130
## 3 754.94  11  46  yes northeast white      no    26.4       153
## 4 593.54  12  36  yes northeast other      no     9.9        86
## 5 377.23  16  22  yes northeast white     yes     7.1       181
## 6 284.90   8  51  yes northeast white      no    11.4        32
# Split the renamed full data by race. `nonblack` holds every row whose
# race is not "black".
black <- subset(dfram, Race == "black")
white <- subset(dfram, Race == "white")
nonblack <- subset(dfram, Race != "black")

# Stack the black and white wages into one vector with a matching label.
race <- rep(c("black", "white"), times = c(nrow(black), nrow(white)))
wage <- c(black$Wage, white$Wage)

# Raw wages. `outline = FALSE` only hides the outlier points in the
# drawing; no observation is removed from the data.
boxplot(wage ~ race, main = "boxplot", xlab = "Race", ylab = "Wage",
        outline = FALSE)

# log wage: the axis label names the plotted quantity.
boxplot(log(wage) ~ race, main = "boxplot", xlab = "Race",
        ylab = "log(Wage)")

#model.1 <- lm(wage~edu, data=d)
#plot(d$edu, d$wage)
#abline(-32.2755,51.5334)

#summary(model.1)
# Sample size: the number of observations (rows) in the full data set.
# length() on a data frame counts its columns (it returned 9 here, the
# number of variables), so nrow() is used instead.
n <- nrow(d)
n

II. Statistical Model

III. Research Question

IV. Appendix

a. Model Selection

b. Diagnostics and Model Validation

c. Influential Observations and Collinearity

d. Others